1 Introduction

The objective of this notebook is to refine the clustering annotation done at level 4. This refinement is the result of a manual curation carried out by specialists to remove poor quality cells, misclassified cells or clusters with very few cells.

2 Pre-processing

2.1 Load packages

library(Seurat)
library(Signac)
library(tidyverse)
library(reshape2)
library(ggpubr)
library(harmony)
library(unikn) 
library(plyr)

set.seed(333)

2.2 Parameters

cell_type = "PC"

path_to_obj <- str_c(
  here::here("scATAC-seq/results/R_objects/level_4/"),
  cell_type,
  "/",
  cell_type,
  "_integration_peak_calling_level_4.rds",
  sep = ""
)

path_to_save <- str_c(
  here::here("scATAC-seq/results/R_objects/level_5/"),
  cell_type,
    "/01.",
  cell_type,
  "_integrated_MBC_GCBC_level_5.rds",
  sep = ""
)

path_to_obj_RNA <- str_c(
  here::here("scRNA-seq/3-clustering/5-level_5/"),
  cell_type,
    "/",
  cell_type,
  "_seu_obj_level_5_eta.rds",
  sep = ""
)

path_to_obj_B_cell_lineage <- here::here("scATAC-seq/results/R_objects/B_cells_integrated.rds")
path_to_gcbc <- here::here("scATAC-seq/results/R_objects/level_4/GCBC/GCBC_integration_peak_calling_level_4.rds")

# Functions
source(here::here("scRNA-seq/bin/utils.R"))

3 Load scATAC data

3.1 Load PC cells

seurat <- readRDS(path_to_obj)
seurat
## An object of class Seurat 
## 81160 features across 1932 samples within 1 assay 
## Active assay: peaks_redefined (81160 features, 76624 variable features)
##  3 dimensional reductions calculated: umap, lsi, harmony
DimPlot(seurat, 
  group.by = "annotation_level_5",
  pt.size = 0.1)

3.2 Load GCBC cells

seurat_GCBC <- readRDS(path_to_gcbc)
seurat_GCBC
## An object of class Seurat 
## 142257 features across 21447 samples within 1 assay 
## Active assay: peaks_redefined (142257 features, 142254 variable features)
##  3 dimensional reductions calculated: umap, lsi, harmony
DimPlot(seurat_GCBC, 
  pt.size = 0.1, label = T)

3.2.1 Load entire B-cell lineage

seurat_B_cell_lineage <- readRDS(path_to_obj_B_cell_lineage)
seurat_B_cell_lineage
## An object of class Seurat 
## 270784 features across 64847 samples within 1 assay 
## Active assay: peaks_macs (270784 features, 269800 variable features)
##  3 dimensional reductions calculated: umap, lsi, harmony
DimPlot(seurat_B_cell_lineage, 
  pt.size = 0.1, label = T)

4 Load scRNA data

4.1 Load PC cells

seurat_RNA <- readRDS(path_to_obj_RNA)

DimPlot(seurat_RNA, 
        group.by = "names_level_5_clusters_eta",
        label = T,
        pt.size = 0.1) + NoLegend()

4.1.1 Clustering of cells into larger groups.

Due to the limited number of cells that we have in our scATAC dataset, we are going to merge related scRNAseq cells into larger groups to facilite cell annotation and interpretation.

seurat_RNA$annotation_level_5 <- revalue(seurat_RNA$names_level_5_clusters_eta,
c("Dark Zone GCBC"="Dark Zone GCBC",
"DZ migratory PC precursor"=NA,
"Light Zone GCBC"="Light Zone GCBC",             
"PC committed Light Zone GCBC"="PC committed Light Zone GCBC",    
"Early PC precursor"="Early PC precursor",             
"PB committed early PC precursor"="PB",
"Transitional PB"="PB",                 
"PB"="PB",                               
"IgG+ PC precursor"="IgG+ PC precursor",              
"preMature IgG+ PC"="preMature IgG+ PC",               
"Mature IgG+ PC"="Mature PC",                 
"MBC derived IgG+ PC"="Mature PC",            
"Mature IgA+ PC"="Mature PC",                 
"MBC derived IgA+ PC"="Mature PC",             
"class switch MBC"="class switch MBC",               
"MBC derived early PC precursor"="MBC derived PC precursor",  
"MBC derived PC precursor"="MBC derived PC precursor",        
"IgM+ early PC precursor"="Early PC precursor",        
"IgM+ PC precursor"="IgM+ PC precursor",                 
"preMature IgM+ PC"="preMature IgM+ PC",               
"Mature IgM+ PC"="Mature PC",                 
"Short lived IgM+ PC"=NA,             
"IgD PC precursor"="IgD PC precursor"))                            


DimPlot(seurat_RNA, 
        group.by = "annotation_level_5",
        label = T,
        pt.size = 0.1) 

general_counts <- table(seurat_RNA$annotation_level_5,
                        seurat_RNA$assay)
general_counts_melt <-  melt(general_counts)
ggbarplot(data = general_counts_melt,
          x = "Var1",
          y = "value",
         fill = "Var2", 
         color = "Var2", 
         palette = "Paired", 
         label = TRUE,
         orientation = "horiz",
         position = position_dodge(0.9))

seurat_only_multiome <- subset(seurat_RNA, assay == "multiome")
seurat_only_multiomes_melt <-  melt(table(seurat_only_multiome$annotation_level_5, 
seurat_only_multiome$age_group))

ggbarplot(data = seurat_only_multiomes_melt,
          x = "Var1",
          y = "value",
         fill = "Var2", 
         color = "Var2", 
         palette = "Paired", 
         label = TRUE,
         orientation = "horiz",
         position = position_dodge(0.9))

DimPlot(seurat_B_cell_lineage, 
  cols.highlight = "darkred", cols= "grey",
  cells.highlight= colnames(seurat_only_multiome),
  pt.size = 0.1)

5 Extracting key clusters from the entire B-cell lineage

5.1 Subclustering of GCBC LZ cells

# seurat_B_cell_lineage@graphs$graphname
seurat_B_cell_lineage <- FindSubCluster(
  seurat_B_cell_lineage,
  cluster = 3,
  graph.name = "peaks_macs_snn",
  subcluster.name = "LZ",
  resolution = 0.15)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 8315
## Number of edges: 276595
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9104
## Number of communities: 5
## Elapsed time: 0 seconds
DimPlot(seurat_B_cell_lineage, 
        group.by = "LZ", 
        label = T)

LZ_1 <- colnames(seurat_B_cell_lineage)[seurat_B_cell_lineage$LZ == "3_2" & seurat_B_cell_lineage$assay == "scATAC"]
LZ_1_m <- colnames(seurat_B_cell_lineage)[seurat_B_cell_lineage$LZ == "3_2" & seurat_B_cell_lineage$assay == "multiome"]
LZ_1_m_final <- LZ_1_m[which(LZ_1_m %in% colnames(seurat_only_multiome))]

LZ_2 <- colnames(seurat_B_cell_lineage)[seurat_B_cell_lineage$LZ == "3_3" & seurat_B_cell_lineage$assay == "scATAC"]
LZ_2_m <- colnames(seurat_B_cell_lineage)[seurat_B_cell_lineage$LZ == "3_3" & seurat_B_cell_lineage$assay == "multiome"]
LZ_2_m_final <- LZ_2_m[which(LZ_2_m %in% colnames(seurat_only_multiome))]

LZ_1.cells.use <- sample(x = c(LZ_1,LZ_1_m_final), size = 600)
LZ_2.cells.use <- sample(x = c(LZ_2,LZ_2_m_final), size = 600)

5.2 Subclustering of GCBC DZ cells

seurat_B_cell_lineage <- FindSubCluster(
  seurat_B_cell_lineage,
  cluster = 1,
  graph.name = "peaks_macs_snn",
  subcluster.name = "DZ",
  resolution = 0.2)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 11405
## Number of edges: 360759
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9009
## Number of communities: 5
## Elapsed time: 0 seconds
DimPlot(seurat_B_cell_lineage, 
        group.by = "DZ", 
        label = T)

DZ_1 <- colnames(seurat_B_cell_lineage)[seurat_B_cell_lineage$DZ == "1_0" & seurat_B_cell_lineage$assay == "scATAC"]
DZ_1_m <- colnames(seurat_B_cell_lineage)[seurat_B_cell_lineage$DZ == "1_0" & seurat_B_cell_lineage$assay == "multiome"]
DZ_1_m_final <- DZ_1_m[which(DZ_1_m %in% colnames(seurat_only_multiome))]

DZ_1.cells.use <- sample(x = c(DZ_1,DZ_1_m_final), size = 600)

5.2.1 Clean GCBC with the processed dataset

LZ_1.cells.use = subset(LZ_1.cells.use, !(LZ_1.cells.use %in% setdiff(LZ_1.cells.use,colnames(seurat_GCBC))))
LZ_2.cells.use = subset(LZ_2.cells.use, !(LZ_2.cells.use %in% setdiff(LZ_2.cells.use,colnames(seurat_GCBC))))
DZ_1.cells.use = subset(DZ_1.cells.use, !(DZ_1.cells.use %in% setdiff(DZ_1.cells.use,colnames(seurat_GCBC))))

5.3 Subclustering of csMBC cells

seurat_B_cell_lineage <- FindSubCluster(
  seurat_B_cell_lineage,
  cluster = 2,
  graph.name = "peaks_macs_snn",
  subcluster.name = "csMBC",
  resolution = 0.4)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
## 
## Number of nodes: 10806
## Number of edges: 312801
## 
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8260
## Number of communities: 21
## Elapsed time: 0 seconds
DimPlot(seurat_B_cell_lineage, 
        group.by = "csMBC",
        label = T)

csMBC <- colnames(seurat_B_cell_lineage)[seurat_B_cell_lineage$csMBC == "2_0" & seurat_B_cell_lineage$assay == "scATAC"]
csMBC_m <- colnames(seurat_B_cell_lineage)[seurat_B_cell_lineage$csMBC == "2_0" & seurat_B_cell_lineage$assay == "multiome"]
csMBC_m_final <- csMBC_m[which(csMBC_m %in% colnames(seurat_only_multiome))]

csMBC.cells.use <- sample(x = c(csMBC,csMBC_m_final), size = 600)

6 Extraction the barcodes of the selected cells

selecting_cells <- unique(c(colnames(seurat),
                     LZ_1.cells.use,
                     LZ_2.cells.use,
                     DZ_1.cells.use,
                     csMBC.cells.use))


DimPlot(seurat_B_cell_lineage, 
  cols.highlight = "brown1", 
  cols= "grey",
  cells.highlight= selecting_cells,
  pt.size = 0.1)

7 Extraction of Multiome cells

7.1 From scRNA assay

tonsil_RNA_annotation <- seurat_RNA@meta.data %>%
  dplyr::filter(assay == "multiome") %>%
  dplyr::select("barcode", "annotation_level_5","donor_id")

table(tonsil_RNA_annotation$donor_id)
## 
## BCLL-14-T BCLL-15-T  BCLL-2-T  BCLL-8-T  BCLL-9-T 
##       193       212      1065       456       652
print(paste("Number of PC from scRNA-seq multiome:", sum(table(tonsil_RNA_annotation$annotation_level_5))))
## [1] "Number of PC from scRNA-seq multiome: 2521"

7.2 From scATAC assay

tonsil_ATAC_cell_barcode <- seurat@meta.data %>%
  rownames_to_column(var = "cell_barcode") %>%
  dplyr::filter(assay == "multiome") %>%
  dplyr::select("cell_barcode","donor_id")

table(tonsil_ATAC_cell_barcode$donor_id)
## 
## BCLL-14-T BCLL-15-T  BCLL-8-T  BCLL-9-T 
##        68        87       237       261
print(paste("Number of PC from scATAC-seq multiome:", sum(table(tonsil_ATAC_cell_barcode$donor_id))))
## [1] "Number of PC from scATAC-seq multiome: 653"

7.3 Detecting posible doublets in scATAC and clusters with low number of cells

length(intersect(tonsil_ATAC_cell_barcode$cell_barcode,tonsil_RNA_annotation$barcode))
## [1] 647
possible_doublets_ATAC <- setdiff(tonsil_ATAC_cell_barcode$cell_barcode,tonsil_RNA_annotation$barcode)
length(possible_doublets_ATAC)
## [1] 6
barcodes_filter_groups <- tonsil_RNA_annotation[which(is.na(tonsil_RNA_annotation$annotation_level_5)),]$barcode

8 Merging selected quality cells

cells <- selecting_cells[!(selecting_cells %in% c(possible_doublets_ATAC,barcodes_filter_groups))]
PC_level5 <- subset(seurat_B_cell_lineage, 
                    cells = cells)

PC_level5$annotation_level_5 <- "unannotated"

# Renaming cells
PC_level5@meta.data[DZ_1.cells.use,]$annotation_level_5  <- "DZ_1"
PC_level5@meta.data[LZ_1.cells.use,]$annotation_level_5  <- "LZ_1"
PC_level5@meta.data[LZ_2.cells.use,]$annotation_level_5  <- "LZ_2"
PC_level5@meta.data[csMBC.cells.use,]$annotation_level_5  <- "csMBC"

PC_level5@meta.data[colnames(PC_level5),]$annotation_level_5  <- as.character(PC_level5@meta.data[colnames(PC_level5),]$annotation_level_5)

9 Integration of the data

PC_level5 <- PC_level5 %>%
  RunTFIDF() %>%
  FindTopFeatures(min.cutoff = 10) %>%
  RunSVD()

DepthCor(PC_level5)

PC_level5 <- RunUMAP(object = PC_level5, 
                             reduction = 'lsi', 
                             dims = 2:40)

DimPlot(PC_level5,
      group.by = "annotation_level_5",
      pt.size = 0.2)

DimPlot(PC_level5,
      group.by = "assay",
      pt.size = 0.2)

PC_level5 <- RunHarmony(
  object = PC_level5,
  dims = 2:40,
  group.by.vars = 'gem_id',
  reduction = 'lsi',
  assay.use = 'peaks_macs',
  project.dim = FALSE,
  max.iter.harmony = 20
)
PC_level5 <- RunUMAP(PC_level5, 
                             reduction = "harmony",
                             dims = 2:16)
DimPlot(PC_level5,
      group.by = "annotation_level_5",
      pt.size = 1)

DimPlot(PC_level5,
      group.by = "annotation_level_5", 
      split.by = "assay",
      pt.size = 0.2)

DimPlot(PC_level5,
      split.by = "age_group",
      pt.size = 0.2)

10 Save

saveRDS(PC_level5, path_to_save)

11 Session Information

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-apple-darwin13.4.0 (64-bit)
## Running under: macOS Big Sur 10.16
## 
## Matrix products: default
## BLAS/LAPACK: /Users/pauli/opt/anaconda3/envs/Motif_TF/lib/libopenblasp-r0.3.10.dylib
## 
## locale:
## [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] plyr_1.8.6         unikn_0.4.0        harmony_1.0        Rcpp_1.0.6         ggpubr_0.4.0       reshape2_1.4.4     forcats_0.5.0      stringr_1.4.0      dplyr_1.0.7        purrr_0.3.4        readr_1.4.0        tidyr_1.1.3        tibble_3.1.2       ggplot2_3.3.5      tidyverse_1.3.0    Signac_1.2.1       SeuratObject_4.0.2 Seurat_4.0.3       BiocStyle_2.16.1  
## 
## loaded via a namespace (and not attached):
##   [1] utf8_1.2.1             reticulate_1.20        tidyselect_1.1.1       htmlwidgets_1.5.3      grid_4.0.3             docopt_0.7.1           BiocParallel_1.22.0    Rtsne_0.15             munsell_0.5.0          codetools_0.2-17       ica_1.0-2              future_1.21.0          miniUI_0.1.1.1         withr_2.4.2            colorspace_2.0-2       knitr_1.30             rstudioapi_0.11        stats4_4.0.3           ROCR_1.0-11            ggsignif_0.6.0         tensor_1.5             listenv_0.8.0          labeling_0.4.2         slam_0.1-47            GenomeInfoDbData_1.2.3 polyclip_1.10-0        farver_2.1.0           rprojroot_2.0.2        parallelly_1.26.1      vctrs_0.3.8            generics_0.1.0         xfun_0.18              lsa_0.73.2             ggseqlogo_0.1          R6_2.5.0               GenomeInfoDb_1.24.2    bitops_1.0-7           spatstat.utils_2.2-0   assertthat_0.2.1       promises_1.2.0.1       scales_1.1.1           gtable_0.3.0           globals_0.14.0         goftest_1.2-2          rlang_0.4.11           RcppRoll_0.3.0         splines_4.0.3          rstatix_0.6.0          lazyeval_0.2.2         spatstat.geom_2.2-0    broom_0.7.2           
##  [52] BiocManager_1.30.10    yaml_2.2.1             abind_1.4-5            modelr_0.1.8           backports_1.1.10       httpuv_1.6.1           tools_4.0.3            bookdown_0.21          ellipsis_0.3.2         spatstat.core_2.2-0    RColorBrewer_1.1-2     BiocGenerics_0.34.0    ggridges_0.5.3         zlibbioc_1.34.0        RCurl_1.98-1.2         rpart_4.1-15           deldir_0.2-10          pbapply_1.4-3          cowplot_1.1.1          S4Vectors_0.26.0       zoo_1.8-9              haven_2.3.1            ggrepel_0.9.1          cluster_2.1.0          here_1.0.1             fs_1.5.0               magrittr_2.0.1         RSpectra_0.16-0        data.table_1.14.0      scattermore_0.7        openxlsx_4.2.4         lmtest_0.9-38          reprex_0.3.0           RANN_2.6.1             SnowballC_0.7.0        fitdistrplus_1.1-5     matrixStats_0.59.0     hms_0.5.3              patchwork_1.1.1        mime_0.11              evaluate_0.14          xtable_1.8-4           rio_0.5.16             sparsesvd_0.2          readxl_1.3.1           IRanges_2.22.1         gridExtra_2.3          compiler_4.0.3         KernSmooth_2.23-17     crayon_1.4.1           htmltools_0.5.1.1     
## [103] mgcv_1.8-33            later_1.2.0            lubridate_1.7.9        DBI_1.1.0              tweenr_1.0.1           dbplyr_1.4.4           MASS_7.3-53            Matrix_1.3-4           car_3.0-10             cli_3.0.0              parallel_4.0.3         igraph_1.2.6           GenomicRanges_1.40.0   pkgconfig_2.0.3        foreign_0.8-80         plotly_4.9.4.1         spatstat.sparse_2.0-0  xml2_1.3.2             XVector_0.28.0         rvest_0.3.6            digest_0.6.27          sctransform_0.3.2      RcppAnnoy_0.0.18       spatstat.data_2.1-0    Biostrings_2.56.0      rmarkdown_2.5          cellranger_1.1.0       leiden_0.3.8           fastmatch_1.1-0        uwot_0.1.10            curl_4.3.2             shiny_1.6.0            Rsamtools_2.4.0        lifecycle_1.0.0        nlme_3.1-150           jsonlite_1.7.2         carData_3.0-4          viridisLite_0.4.0      fansi_0.5.0            pillar_1.6.1           lattice_0.20-41        fastmap_1.1.0          httr_1.4.2             survival_3.2-7         glue_1.4.2             qlcMatrix_0.9.7        zip_2.1.1              png_0.1-7              ggforce_0.3.2          stringi_1.6.2          blob_1.2.1            
## [154] irlba_2.3.3            future.apply_1.7.0